package uc.files.filelist;
import helpers.GH;
import helpers.SizeEnum;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import logger.LoggerFactory;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.eclipse.core.runtime.IProgressMonitor;
import org.eclipse.core.runtime.IStatus;
import org.eclipse.core.runtime.Status;
import org.eclipse.core.runtime.jobs.Job;
import uc.LanguageKeys;
import uc.PI;
import uc.crypto.HashValue;
public class TextIndexer {
private static final Logger logger = LoggerFactory.make();
// private static final int MAX_TOTALSIZE = 100*1024*1024; //max size to be indexed..
private static final int MAX_RAMSIZE_FOR_PDF = 15 * 1024 * 1024; // max size to be held in ram..
private static final String FIELD_HASH = "hash",FIELD_CONTENT = "contents",FIELD_ENDING = "ending"; //contens field -> from PDFReader
private static final Set<String> SUPPORTED_ENDINGS =
new HashSet<String>(Arrays.asList("txt","nfo","pdf"));
public static boolean matchesSomeEnding(Collection<String> endings) {
for (String s:endings) {
if (SUPPORTED_ENDINGS.contains(s)) {
return true;
}
}
return false;
}
private final SimpleAnalyzer analyzer = new SimpleAnalyzer(Version.LUCENE_34);
private final Directory index;
private IndexWriter w;
private final File scratch;
private RandomAccessFile scratchRaf;
private final Set<HashValue> presentHashes = new HashSet<HashValue>();
private volatile boolean created = false;
private final File dir;
private IndexTextFiles job;
public TextIndexer() throws IOException {
System.setProperty( "org.apache.lucene.FSDirectory.class", "org.apache.lucene.store.FSDirectory");
//have to do that -> sadly because of indexer uses it and in 0.79 was set to nioFS which does not exist in old lucene..
dir = new File(new File(PI.getStoragePath(),"db"),"textindex");
File lockfile = new File(dir,"write.lock");
if (lockfile.exists() && !lockfile.delete()) {
lockfile.deleteOnExit();
}
scratch = new File(dir,"scratch");
if (scratch.isFile()) {
scratch.deleteOnExit();
}
index = new NIOFSDirectory(dir);
}
public void init(OwnFileList list) {
try {
boolean createNew = !dir.isDirectory()
|| dir.listFiles().length == 0;
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_34, analyzer);
// iwc.setRAMBufferSizeMB(10);
w = new IndexWriter(index, iwc);
w.commit();
if (!createNew) {
IndexSearcher searcher = new IndexSearcher(index,true);
IndexReader ir = searcher.getIndexReader();
int size = ir.numDocs();
long timestart = System.currentTimeMillis();
logger.debug("Read "+size+" filehashes");
FieldSelector fs = new HashFieldSelector();
for (int i = 0; i < size; i++) {
Document doc = ir.document(i, fs);
byte[] hashB = doc.getBinaryValue(FIELD_HASH);
HashValue hash = HashValue.createHash(hashB);
presentHashes.add(hash);
}
searcher.close();
logger.debug("info time needed: "+(System.currentTimeMillis()-timestart));
}
index(list);
} catch (FileNotFoundException fnfe) {
logger.debug("file not found..", fnfe);
} catch (Exception e) {
logger.warn(e, e);
}
}
public static boolean matches(String filename,long filesize) {
String ending = GH.getFileEnding(filename).toLowerCase();
return SUPPORTED_ENDINGS.contains(ending) && 0 < filesize ; // && filesize <= MAX_TOTALSIZE ;
}
public synchronized void addPDFIfAbsent(File f,HashValue hashOfFile) throws IOException {
if (exists(hashOfFile)) {
return;
}
if (!matches(f.getName(),f.length())) {
return;
}
try {
storeDocument(f, hashOfFile);
presentHashes.add(hashOfFile);
} catch (CorruptIndexException e) {
logger.warn(e, e);
} catch (IOException e) {
logger.warn(e, e);
}
}
private boolean exists(HashValue hash) {
return presentHashes.contains(hash);
}
public synchronized Set<HashValue> search(Set<String> keys,Set<String> excludes,Collection<String> endings) {
if (presentHashes.isEmpty()) { //if inverted Index is empty .. -> no results..
return Collections.<HashValue>emptySet();
}
BooleanQuery bq = new BooleanQuery();
for (String s : keys) {
if (s.contains(" ")) {
PhraseQuery pq = new PhraseQuery();
for (String subterm: s.split(" ")) {
pq.add(new Term(FIELD_CONTENT,subterm));
}
bq.add(pq, BooleanClause.Occur.MUST);
} else {
bq.add(new TermQuery(new Term(FIELD_CONTENT, s)), BooleanClause.Occur.MUST);
}
}
for (String s : excludes) {
if (s.contains(" ")) {
PhraseQuery pq = new PhraseQuery();
for (String subterm: s.split(" ")) {
pq.add(new Term(FIELD_CONTENT,subterm));
}
bq.add(pq, BooleanClause.Occur.MUST_NOT);
} else {
bq.add(new TermQuery(new Term(FIELD_CONTENT, s)), BooleanClause.Occur.MUST_NOT);
}
}
if (!endings.isEmpty()) {
BooleanQuery equery = new BooleanQuery();
for (String s: endings) {
equery.add(new TermQuery(new Term(FIELD_ENDING, s)), BooleanClause.Occur.SHOULD);
}
bq.add(equery, BooleanClause.Occur.MUST);
}
Set<HashValue> found = new HashSet<HashValue>();
try {
IndexSearcher searcher = new IndexSearcher(index,true);
TopScoreDocCollector collector = TopScoreDocCollector.create(25, false); // new TopDocCollector(10);
searcher.search(bq, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
for (ScoreDoc sd: hits) {
int docId = sd.doc;
Document d = searcher.doc(docId);
found.add(HashValue.createHash(d.getBinaryValue(FIELD_HASH)));
}
searcher.close();
} catch(Exception e) {
logger.warn(e,e);
}
return found;
}
public synchronized void stop() {
if (w != null) {
try {
blockingstop();
w.commit();
w.close();
w = null;
} catch (Exception e) {
logger.error("Problem creating the FileListindex: "+e,e);
}
}
}
private void blockingstop() {
while (job != null) {
job.cancel();
try {
wait(100);
} catch (InterruptedException e) {
logger.warn(e,e);
}
}
}
private void index(OwnFileList list) {
blockingstop();
job = new IndexTextFiles(list);
job.schedule();
}
private static final class HashFieldSelector implements FieldSelector {
private static final long serialVersionUID = 1L;
public FieldSelectorResult accept(String fieldName) {
if (fieldName.equals(FIELD_HASH)) {
return FieldSelectorResult.LOAD;
} else {
return FieldSelectorResult.NO_LOAD;
}
}
}
class IndexTextFiles extends Job {
private final IOwnFileList list;
public IndexTextFiles(IOwnFileList list) {
super("Indexing Textfiles");
this.list = list;
}
@Override
protected IStatus run(IProgressMonitor monitor) {
String debugcurrent = "";
try {
List<FileListFile> pdfFiles = new ArrayList<FileListFile>();
for (FileListFile file: list.getFileList().getRoot()) {
if (matches(file.getName(),file.getSize())) {
pdfFiles.add(file);
}
}
monitor.beginTask(LanguageKeys.IndexingTextfiles, pdfFiles.size());
logger.debug("Files total: "+pdfFiles.size());
for (FileListFile file: pdfFiles) {
File f = null;
f = list.getFile(file.getTTHRoot()); //checks if its still there in current filelist
if (f != null) {
debugcurrent = file.getName() +" "+file.getSize();
synchronized(TextIndexer.this) {
if (monitor.isCanceled()) {
return Status.CANCEL_STATUS;
}
if (!exists(file.getTTHRoot())) {
monitor.subTask(String.format("%s (%s)",file.getName(),SizeEnum.getReadableSize(file.getSize())));
addPDFIfAbsent(f, file.getTTHRoot());
}
}
}
monitor.worked(1);
}
if (scratchRaf != null) {
scratchRaf.close();
scratch.delete();
}
} catch (Throwable e) {
logger.warn(e + debugcurrent, e);
} finally {
monitor.done();
TextIndexer.this.job = null;
setCreated(true);
}
return Status.OK_STATUS;
}
}
private void storeDocument(File f,HashValue hash) throws IOException {
Reader r = null;
try {
r = getReader(f);
} catch (Exception ioe) {
logger.debug("ioe -> file ignored: "+f.getName(),ioe);
r = null;
}
try {
Document doc = new Document();
doc.add(new Field(FIELD_HASH,hash.getRaw(),0,hash.getRaw().length)); //, Field.Store.YES
doc.add(new Field(FIELD_ENDING,GH.getFileEnding(f.getName()),Field.Store.YES,Index.ANALYZED));
if (r != null) {
doc.add( new Field( FIELD_CONTENT, r ));
}
w.addDocument(doc);
} finally {
GH.close(r);
}
}
private Reader getReader(File file) throws IOException {
FileInputStream input = new FileInputStream(file);
BufferedInputStream bin = new BufferedInputStream(input);
String fileending = GH.getFileEnding(file.getName());
if (fileending.equalsIgnoreCase("pdf")) {
PDDocument pdfDocument = null;
try {
// if (file.length() > MAX_TOTALSIZE/2) {
// System.gc();
// }
pdfDocument = PDDocument.load(bin, getScratchRaf(),true);
if (pdfDocument.isEncrypted()) {
return null;
}
PDFTextStripper stripper = new PDFTextStripper();
// create a writer where to append the text content.
Reader reader;
if (file.length() < MAX_RAMSIZE_FOR_PDF) {
StringWriter writer = new StringWriter();
stripper.writeText(pdfDocument, writer);
String contents = writer.getBuffer().toString();
reader = new StringReader(contents);
} else {
final File f = new File(PI.getTempPath(),"index.tmp");
FileWriter fw = new FileWriter(f);
try {
stripper.writeText(pdfDocument, fw);
} finally {
GH.close(fw);
}
FileReader fr = new FileReader(f) {
@Override
public void close() throws IOException {
super.close();
if (!f.delete()) {
f.deleteOnExit();
}
}
};
reader = fr;
}
return reader;
} finally {
if (pdfDocument != null) {
pdfDocument.close();
}
}
} else {
return new FileReader(file);
}
}
public boolean isCreated() {
return created;
}
private void setCreated(boolean created) {
this.created = created;
}
private RandomAccessFile getScratchRaf() throws IOException{
if (scratchRaf != null) {
scratchRaf.close();
scratch.delete();
}
scratchRaf = new RandomAccessFile(scratch,"rw");
return scratchRaf;
}
}